last updated: July 27 2025
Manuscript Title:
Authors: Note: Microsoft copilot was used to clean up code.
Affiliations:
Description of the dataset:
Here, we seek to evaluate the effect of a diet on the microbiome as well its ability to buffer against effects induced by Candida colonization.
To accomplish this, we read in the following data: 1. phyloseq object generated by Cecilia from merged reads after taxonomic glom to the rank of genus 2. phyloseq object generated by Cecilia from merged reads
Here, we render the following figures:
Figure 1 Supplementary Figure 1 Supplementary Figure 2 Supplementary Figure 3 —
library(knitr)
opts_chunk$set(fig.width=12, fig.height=8,
echo=TRUE, warning=FALSE, message=FALSE, error = FALSE)
Load R packages
# Set seed
set.seed(78979)
# Define required packages
packages <- c(
# Core tidyverse packages
"tidyverse",
# Phylogenetics and microbiome analysis
"phyloseq",
"microbiome",
"phangorn",
"msa",
# Statistical modeling and analysis
"lmerTest",
"rstatix",
"broom",
"broom.mixed",
"genefilter",
"vegan",
"ALDEx2",
# Visualization enhancements
"ggpubr",
"ggprism",
"gridExtra",
"ggbeeswarm",
"ggrepel",
"cowplot",
"ggtext",
# Data import
"readxl"
)
# Install missing CRAN packages
cran_packages <- setdiff(packages, rownames(installed.packages()))
cran_packages <- cran_packages[!cran_packages %in% c("phyloseq", "microbiome", "phangorn", "msa", "genefilter", "vegan", "ALDEx2")]
if(length(cran_packages) > 0) install.packages(cran_packages)
# Install missing Bioconductor packages
bioc_packages <- c("phyloseq", "microbiome", "phangorn", "msa", "genefilter", "vegan", "ALDEx2")
bioc_to_install <- setdiff(bioc_packages, rownames(installed.packages()))
if(length(bioc_to_install) > 0) BiocManager::install(bioc_to_install, update = FALSE)
# Load packages
n <- length(packages) - sum(sapply(packages, require, character.only = TRUE))
# Print loading status
if (n == 0) {
print("All necessary R packages loaded properly")
} else {
print(paste0(n, " R packages did not load properly"))
}
## [1] "All necessary R packages loaded properly"
#set graphs to the graphpad prism theme
theme_set(theme_prism())
#read in the phyloseq object
ps = readRDS("~/Desktop/perez/PerezDiet_merged_Len250_EE5_5_RefSeq_phyGenus_tre.rds")
tax_df = data.frame(ps@tax_table@.Data)
#make the mapping file
map = read.csv("~/Desktop/perez/Updated_metadata.csv")
rownames(map) = map$SampleID
#dim(map)
#fix the day labels
map$Day = as.factor(as.character(map$Day))
map$Time_label <- plyr::revalue(map$Day, c(
"-8" = "T1",
"-7" = "T1",
"-1" = "T2",
"10" = "T3",
"12" = "T3",
"13" = "T3"))
#change from fatty acid diet to "HOA"
map$Diet <- plyr::revalue(map$Diet, c("FattyAcid"="HOA", "Control"="control (AIN-93G)"))
sample_data(ps) <- sample_data(map)
we will look at these plots a few ways. in the first iteration, we will look at the data with the x axis ordered by cage number and then mouse number 1:2
# Transform to relative abundance
ps_relabund <- transform_sample_counts(ps, function(x) x / sum(x))
# Identify top 15 genera by total abundance
TopNOTUs <- names(sort(taxa_sums(ps_relabund), TRUE)[1:15])
top15 <- prune_species(TopNOTUs, ps_relabund)
tax = data.frame(top15@tax_table@.Data)
top_genera = unique(tax$Genus)
# Melt to long format
df_long <- psmelt(ps_relabund)
# Collapse all other genera into "Other"
df_long <- df_long %>%
mutate(Genus = ifelse(Genus %in% top_genera, Genus, "Other"))
# Define 16 visually distinct hex colors
palette <- c(
"#1f77b4",
"#ff7f0e",
"#2ca02c",
"#d62728",
"#9467bd",
"#8c564b",
"#e377c2",
"#7f7f7f",
"#bcbd22",
"#17becf",
"#aec7e8",
"#ffbb78",
"#98df8a",
"#ff9896",
"#c5b0d5",
"#c49c94"
)
#define time point colors
time_colors <- c("#0072B2", "#E69F00", "#009E73")
#make a new factor
df_long$Plot_factor = paste0(df_long$Cage, ";", df_long$Day, ";", df_long$Mouse)
# Plot
#ggplot(df_long, aes(x = Plot_factor, y = Abundance, fill = Genus)) +
# geom_bar(stat = "identity", alpha = 0.8) +
# facet_wrap(Group ~ Diet, scales = "free_x", ncol=2) +
# scale_fill_manual(values = palette) +
# labs(
# x = "Sample (Ordered by Target Genera Abundance)",
# y = "Relative Abundance"
# ) +
# theme_minimal() +
#theme(axis.text.x = element_text(angle = 90, hjust = 1))
lets look to see if sequencing depth differs between fatty acid and control
we can see that the time point 1 samples have less reads than the other time points, though this difference is only marginally significant.
df = data.frame(Depth = sample_sums(ps), sample_data(ps))
df$My.group = ifelse(df$Group %in% c("G1", "G2"), "pre", "post")
df$Time_label = factor(df$Time_label, levels=c("T1", "T2", "T3"))
p = ggplot(df, aes(Time_label, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
p
if we drop the day -8 samples than this difference goes away. there is no significant difference between sequencing depth of the pre and post samples for the diet comparisons.
df1 = subset(df, Day != "-8")
df1$Time_label = factor(df1$Time_label, levels=c("T1", "T2", "T3"))
pa = ggplot(df1, aes(Time_label, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
pb = ggplot(df1, aes(My.group, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
pc = ggplot(df1, aes(Diet, Depth)) + geom_boxplot() + stat_compare_means()
grid.arrange(pa, pb, pc, ncol=3)
now let’s look at this instead based on the relative abundance of taxa later identified as differentiating days and by diet
#re order based on relative abundance of lactobacillus, bacteroides and Barnesiella
target_genera <- c("Lactobacillus", "Bacteroides", "Barnesiella")
#subset on the target genera
myphy = subset_taxa(ps, Genus %in% target_genera)
#sum the abundances of the target taxa and create a rank based measure of the samples most (1) to least abundant with these taxa
df = data.frame(SampleID = sample_names(ps), TargetOrgAbundance=sample_sums(ps)) %>%
arrange(desc(TargetOrgAbundance))
df$Myorder = 1:nrow(df)
#merge the annotation of rank order for the target taxa with the tax table
df_long_annotated = merge(df, df_long)
df_long_annotated$Myorder <- as.numeric(as.character(df_long_annotated$Myorder))
# Make sure Time_label is a factor with desired order
df_long_annotated$Time_label <- factor(df_long_annotated$Time_label, levels = c("T1", "T2", "T3"))
# Make sure Myorder is a factor with unique levels in order
df_long_annotated$Myorder <- factor(df_long_annotated$Myorder, levels = unique(df_long_annotated$Myorder))
#make the plot
fig1D <- ggplot(df_long_annotated, aes(x = Myorder, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity", position = "stack", alpha=0.9) +
scale_fill_manual(values = palette) +
labs(
x = "Sample (Ordered by Target Genera Abundance)",
y = "Relative Abundance"
) +
theme_minimal() +
theme(
axis.text.x = element_blank(), # hide x axis text
axis.ticks.x = element_blank(), # hide x axis ticks
legend.text = element_text(face = "italic"),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ Time_label + Diet, scales = "free_x", ncol = 2)
fig1D
#save the figures
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.pdf", device="pdf", width=7, height=10)
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.eps", device="eps", width=6, height = 10)
To calculate alpha diversity, we have to use the phyloseq object straight from dada2 since some diversity metrics use singletons in their calculation of diversity. As a result, we will read in the original phyloseq object prior to aggregation to the genus level.
There is a consistent drop in microbial diversity across all diversity metrics in the Control group over time, when comparing T1 vs. T2 and T1 vs. T3. Drops in observed species as well as eveness were seen in the control diet and persisted across the observation window.
The HOA group exhibited a universal trend toward reduced richness and diversity. However, only Simpspon was significant at T1 vs T2 with no difference at T1 vs T3, suggesting that we likely will see an increase in the relative abundance of predominant species in the HOA diet, rather than an overall change in the number of species or abundance of rare taxa, as we see with the controls. Moreover, this change is not so marked or persistent, suggesting the fatty acid diet is stabilizing microbial diversity.
phy <- readRDS("~/Desktop/perez/PerezDiet_merged_Len250_EE5_5_RefSeq.rds")
#update the metadata with cage information
map = read.csv("~/Desktop/perez/Updated_metadata.csv")
rownames(map) = map$SampleID
#fix the labels
map$Time_label <- plyr::revalue(as.factor(map$Day), c(
"-8" = "T1",
"-7" = "T1",
"-1" = "T2",
"10" = "T3",
"12" = "T3",
"13" = "T3"))
#change from fatty acid diet to "HOA"
map$Diet <- plyr::revalue(map$Diet, c("FattyAcid"="HOA", "Control"="control (AIN-93G)"))
sample_data(phy) <- sample_data(map)
#estimate species richness
df = data.frame(estimate_richness(phy, measures=c("Observed", "Chao1", "Shannon", "Simpson")), sample_data(phy)) %>%
reshape2::melt(., c("se.chao1", colnames(map)))
# Generate all pairwise combinations
time_points = c("T1", "T2", "T3")
mycomparisons <- combn(time_points, 2, simplify = FALSE)
#force the ordering of the x axis
ordering = c("T1", "T2", "T3")
df$Time_label = factor(df$Time_label, levels=ordering)
# Define pairwise time comparisons
time_points <- list(c("T1", "T2"), c("T1", "T3"), c("T2", "T3"))
# ----- 1. Pairwise Wilcoxon tests: Within each diet -----
df2 <- df %>%
dplyr::rename(diversity_measure = variable)
within_diet_pvals <- df2 %>%
group_by(diversity_measure, Diet) %>%
pairwise_wilcox_test(
value ~ Time_label,
p.adjust.method = "BH",
comparisons = time_points
) %>%
mutate(comparison_type = "Within Diet",
group = Diet) %>%
add_y_position()
# ----- 2. Wilcoxon tests: Between diets at each time point -----
between_diet_pvals <- df2 %>%
group_by(diversity_measure, Time_label) %>%
wilcox_test(value ~ Diet) %>% # no p.adjust.method here
ungroup() %>%
group_by(diversity_measure) %>%
mutate(p.adj = p.adjust(p, method = "BH")) %>%
add_significance() %>%
mutate(comparison_type = "Between Diets",
group = Time_label) %>%
# Specify the data and formula here:
add_y_position(data = df2, formula = value ~ Diet) %>%
dplyr::select(c("Time_label", "diversity_measure", "group1", "group2", "p", "p.adj", "p.adj.signif"))
write.csv(between_diet_pvals, file="~/Desktop/github/perez_diet/between_diet_same_timepoint_pvalues.csv")
# ----- 4. Plot -----
within_diet_pvals2 <- within_diet_pvals %>%
mutate(y.position = case_when(
diversity_measure == "Simpson" ~ seq(1.01, 1.001, length.out = n()),
diversity_measure == "Shannon" ~ seq(6.9, 7.5, length.out = n()),
diversity_measure == "Chao1" ~ seq(1600, 2300, length.out = n()),
diversity_measure == "Observed"~ seq(1400, 2100, length.out = n()),
TRUE ~ y.position
))
#define a function to generate the pvalues for each of the diversity metrics independently so the p values aren't overplotted
plot_diversity_metric <- function(data, pval_data, diversity_metric, time_colors) {
# Filter the data and p-values for the selected diversity metric
df_sub <- data %>% filter(diversity_measure == diversity_metric)
pval_sub <- pval_data %>% filter(diversity_measure == diversity_metric)
# Custom y.position adjustment depending on metric
pval_sub <- pval_sub %>%
group_by(Diet) %>%
mutate(
# Stagger brackets by adding 0.5 per row index
y.position = case_when(
diversity_metric == "Shannon" ~ y.position,
diversity_metric == "Simpson" ~ seq(0.9999, by = 0.0001, length.out = n()),
diversity_metric == "Chao1" ~ seq(max(df_sub$value) * 1.05, by = 50, length.out = n()),
diversity_metric == "Observed" ~ seq(max(df_sub$value) * 1.05, by = 100, length.out = n()),
TRUE ~ y.position
)
) %>%
ungroup()
# Build the plot
ggplot(df_sub, aes(x = Time_label, y = value, fill = Time_label)) +
geom_boxplot(outlier.shape = NA, alpha = 0.8) +
geom_beeswarm(dodge.width = 0.8, cex = 1) +
stat_pvalue_manual(
pval_sub,
label = "p.adj.signif",
tip.length = 0.01,
bracket.size = 0.4,
size = 3
) +
facet_grid(rows = vars(diversity_measure), cols = vars(Diet), scales = "free") +
scale_fill_manual(values = time_colors) +
theme_minimal(base_family = "Helvetica")+
labs(
x = "",
y = diversity_metric
) +
theme(
legend.position = "none",
plot.title = element_text(face = "italic", hjust = 0.5),
strip.text.y = element_blank()
)
}
SFig1A = plot_diversity_metric(df2, within_diet_pvals2, "Observed", time_colors)
SFig1B = plot_diversity_metric(df2, within_diet_pvals2, "Simpson", time_colors)
SFig1C = plot_diversity_metric(df2, within_diet_pvals2, "Chao1", time_colors)
SFig1 = cowplot::plot_grid(SFig1A, SFig1B, SFig1C, ncol=1, labels=c("A", "B", "C"))
SFig1
ggsave(SFig1, file="~/Desktop/github/perez_diet/SFig1.pdf", device="pdf", width = 6, height = 11)
Let’s look at Shannon Diversity
fig1B = plot_diversity_metric(df2, within_diet_pvals2, "Shannon", time_colors)
fig1B
ggsave(fig1B, file="~/Desktop/github/perez_diet/fig1B.pdf", device="pdf", width = 6, height = 6)
ggsave(fig1B, file="~/Desktop/github/perez_diet/fig1B.eps", device="eps", width = 6, height = 6)
To further refine the dataset, we applied both prevalence and abundance thresholds, removing taxa that were present in fewer than 2 samples and had fewer than 100 total reads. This filtering step reduced the number of genera from 182 to just 46, allowing us to focus on the more consistently represented and biologically relevant taxa.
We do this filtering after calculating alpha diversity since some diversity metrics rely on the presence of singletons to estimate the “unseen” diversity.
#let's filter out taxa that aren't present in at least 2 samples at 100 reads
flist <- filterfun(kOverA(2, 100))
phy_filt <- filter_taxa(ps, flist, TRUE) #down to 46 genera if 100 reads required;
phy_filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 46 taxa and 72 samples ]
## sample_data() Sample Data: [ 72 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 46 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 46 tips and 45 internal nodes ]
#transform clr
clr_phy = transform_sample_counts(phy_filt, function(x) compositions::clr(x))
clr_mat = data.frame(as.matrix(otu_table(clr_phy)))
clr_mat$SampleID = rownames(clr_mat)
We begin by exploring a PCoA ordination based on Bray-Curtis dissimilarity to assess overall community structure.
Panel A shows little separation between the HOA (triangles) and Control diet (circles) groups on the first two principal components.
In Panel B, however, we observe a tendency for early time point (t1, t2) samples to cluster toward positive values on Axis 3, while t1 samples shift toward negative values, indicating a temporal effect.
Panel C confirms that the difference in Axis 3 scores between pre- and post-treatment samples is statistically significant. Additionally, the post-treatment group exhibits greater dispersion, which may reflect increased heterogeneity or simply a larger number of samples.
When we look at between group differences for G1 to G2 we see marginal significance when looking at Axis 3 for G1 vs. G2 but no differences for G3 vs G4
# Ordination and data prep
ord <- ordinate(phy_filt, method = "PCoA", distance = "bray")
df <- data.frame(sample_data(phy_filt), ord$vectors)
df$Mygroup <- ifelse(df$Group %in% c("G1", "G2"), "pre", "post")
df$Time_label <- factor(df$Time_label, levels = c("T1", "T2", "T3"))
# Eigenvalues and proportion explained
prop_explained <- round(100 * ord$values$Eigenvalues / sum(ord$values$Eigenvalues), 2)
# Color palette
mycolors <- c(
G1 = "#D6CDEA",
G2 = "#B39DDB",
G3 = "#C8E6C9",
G4 = "#81C784",
G5 = "#FFD1DC",
G6 = "#F48FB1"
)
# Helper function for axis labels
axis_label <- function(axis_num) {
paste0("Axis ", axis_num, ": ", prop_explained[axis_num], " %")
}
# Base plot layers for boxplots with stats & points
base_boxplot <- function(data, x, y, color_group, facet_var = NULL, y_label = NULL, hide_legend = FALSE) {
p <- ggplot(data, aes_string(x = x, y = y, color = color_group)) +
geom_boxplot() +
geom_point(position = position_jitter(width = 0.1)) +
stat_compare_means()
if (!is.null(facet_var)) {
p <- p + facet_wrap(as.formula(paste("~", facet_var)))
}
if (!is.null(y_label)) p <- p + ylab(y_label)
if (hide_legend) p <- p + theme(legend.position = "none")
p
}
########## Scatter plots
#Axis 1 vs 2
p1 <- ggplot(df, aes(Axis.1, Axis.2, color = Time_label, shape=Diet)) +
geom_point() +
xlab(axis_label(1)) + ylab(axis_label(2)) +
scale_color_manual(values=time_colors)
#Axis 1 vs 3
p2 <- ggplot(df, aes(Axis.1, Axis.3, color = Time_label, shape = Diet)) +
geom_point() +
xlab(axis_label(1)) + ylab(axis_label(3))+
scale_color_manual(values=time_colors)
# Boxplots with faceting and stats
df$Mygroup_plotting = plyr::revalue(df$Mygroup, c("post"="Post", "pre"="Pre"))
df$Mygroup_plotting = factor(df$Mygroup_plotting, levels=c("Pre", "Post"))
p3 <- base_boxplot(df, "Mygroup_plotting", "Axis.3", "Mygroup_plotting",
facet_var = "Diet", y_label = axis_label(3), hide_legend=TRUE) +
xlab("")
#get rid of the pre-time period and see if there's significance -- there is on axis 3
#not shown here but no distinction on axis 1 or 2
p4 <- base_boxplot(subset(df, Mygroup_plotting != "Pre"), "Time_label", "Axis.3", "Time_label",
facet_var = "Diet", y_label = axis_label(3), hide_legend = TRUE) +
scale_color_manual(values=c("#84948E","#A68B6B"))
# Pairwise comparisons subset plots
#G1 vs G2 is marginally significant
g1g2a3 <- df %>%
filter(Group %in% c("G1", "G2")) %>%
base_boxplot("Group", "Axis.3", "Group", y_label = axis_label(3))
g3g4a1 <- df %>%
filter(Group %in% c("G3", "G4")) %>%
base_boxplot("Group", "Axis.4", "Group", y_label = axis_label(4))
# Combine plots
plot_grid(
p1, p2, p3, p4,
labels = c("A", "B", "C", "D"),
ncol = 2,
align = "hv"
)
To further explore community structure, we applied UniFrac, a distance metric that incorporates both taxonomic abundance and phylogenetic relationships among taxa. When we look at the ordination on axis 1 and 2, we see that there does seem to be some segregation samples by time for the control group, but no distinct clustering. The T1 points have more negative scores along axis 2 than T2 or T3. There is a pattern that looks like a gradient (i.e. the horseshoe) but this does not correspond to a difference in sequencing depth.
Using PERMANOVA (adonis), if we do not consider cage effects, we found that diet was not a significant factor, but there was a significant difference between pre- and post-treatment samples, as well as an interaction between diet and time, indicating a temporal shift in microbial composition that is dependent on the diet.
When we do permutations considering cage effects, we see that all three variables are significant. This tells us that samples from the same cage are more similar to each other (are not independent). Controlling for cage reduces noise and suggest that diet really matters. This tells us diet influences how the community changes over time, but this effect only becomes clear when controlling for cage effects.
Time F=5, Diet = 0.9, Timexdiet interaction is 2.8
# Define Main.Factor based on Group
sample_data(phy_filt)$Main.Factor <- ifelse(
sample_data(phy_filt)$Group %in% c("G1", "G2"),
"pre",
"post"
)
# Convert Day to factor
sample_data(phy_filt)$Day <- as.factor(sample_data(phy_filt)$Day)
# Extract metadata as data frame
metadata <- data.frame(sample_data(phy_filt))
# Calculate weighted, normalized UniFrac distance
unifrac_dist <- UniFrac(phy_filt, weighted = TRUE, normalized = TRUE, fast = TRUE)
# Perform PCoA ordination based on UniFrac distance
ord <- ordinate(phy_filt, method = "PCoA", distance = unifrac_dist)
# Create combined data frame for plotting
df <- data.frame(
ord$vectors,
metadata,
Depth = sample_sums(phy_filt)
)
# Convert Time_label to numeric for plotting size or ordering if needed
df$Time_label <- factor(df$Time_label, levels=c("T1", "T2", "T3"))
# Plot PCoA with points sized by sequencing depth, colored by Time_label, shaped by Diet
#use a defined color scheme
my_colors <- c("#0072B2", "#E69F00", "#009E73")
#make the figure
Fig1C <- ggplot(df, aes(x = Axis.1, y = Axis.2, shape = Diet, color = Time_label, size = Depth)) +
geom_point(alpha = 0.8) +
facet_wrap(~ Diet) +
xlab(axis_label(1)) + ylab(axis_label(2)) +
scale_color_manual(values = my_colors) +
guides(
shape = guide_legend(order = 1),
color = guide_legend(order = 2),
size = guide_legend(order = 3)
) +
theme(text = element_text(family = "Helvetica"))
print(Fig1C)
ggsave(Fig1C, file="~/Desktop/github/perez_diet/Fig1C.pdf", device="pdf", width = 8, height = 6)
let’s use a permutational analysis of variance to test for significance
set.seed(789)
# PERMANOVA to test effects of Time_label and Diet on UniFrac distances
adonis_result <- adonis2(
unifrac_dist ~ Time_label * Diet,
data = metadata,
permutations = 999, by="term",
strata = as.factor(metadata$Cage)
)
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, permutations = 999, by = "term", strata = as.factor(metadata$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 2 0.19571 0.12231 5.0566 0.003 **
## Diet 1 0.01774 0.01109 0.9169 0.002 **
## Time_label:Diet 2 0.10948 0.06842 2.8287 0.036 *
## Residual 66 1.27720 0.79819
## Total 71 1.60013 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We also assessed beta dispersion to test for differences in within-group variability, testing within groups (pre, post) as well as by diet and by Time_label. The results were not significant, suggesting that the observed differences in community composition are not due to differences in variability, but rather reflect true compositional shifts. The post samples may have greater dispersion because of the greater number of samples in that group
bd <- betadisper(unifrac_dist, metadata$Diet)
anova(bd)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 0.002323 0.0023232 0.5262 0.4706
## Residuals 70 0.309047 0.0044150
plot(bd)
what happens when we drop the pre samples? when we drop the pre samples, then time is not significant. diet is significant, and the time x diet interaction is significant.
myphy = subset_samples(phy_filt, Main.Factor != "pre") %>%
prune_taxa(taxa_sums(.) > 0, .)
# Convert Day to factor
sample_data(myphy)$Day <- as.factor(sample_data(myphy)$Day)
# Extract metadata as data frame
metadata_post <- data.frame(sample_data(myphy))
# Calculate weighted, normalized UniFrac distance
unifrac_dist_post <- UniFrac(myphy, weighted = TRUE, normalized = TRUE, fast = TRUE)
# Perform PCoA ordination based on UniFrac distance
ord_post <- ordinate(myphy, method = "PCoA", distance = unifrac_dist_post)
# Create combined data frame for plotting
df_post <- data.frame(
ord_post$vectors,
metadata_post,
Depth = sample_sums(myphy)
)
# Convert Time_label to numeric for plotting size or ordering if needed
df_post$Time_numeric <- as.numeric(as.factor(df_post$Time_label))
# Plot PCoA with points sized by sequencing depth, colored by Time_label, shaped by Diet
p <- ggplot(df_post, aes(x = Axis.1, y = Axis.2, color = Time_label, shape = Diet)) +
geom_point(size=3) +
facet_wrap(~ Diet)+
guides(
shape = guide_legend(order = 1), # Diet first
color = guide_legend(order = 2) # Time second
)
print(p)
# PERMANOVA to test effects of Time_label and Diet on UniFrac distances
adonis_result_post <- adonis2(
unifrac_dist_post ~ Time_label * Diet,
data = metadata_post,
permutations = 999, by="term",
strata = as.factor(metadata_post$Cage)
)
print(adonis_result_post)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist_post ~ Time_label * Diet, data = metadata_post, permutations = 999, by = "term", strata = as.factor(metadata_post$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.240
## Diet 1 0.01087 0.00970 0.4806 0.062 .
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.040 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s examine how pre- (baseline) and post-treatment samples segregate by diet along Axis 1 and Axis 2. We return to the complete dataset. In the Control group, we observe a clear separation between pre- and post-treatment samples along Axis 1, suggesting a temporal shift in community composition. In contrast, for the Fatty Acid group, this separation is more apparent along Axis 2, indicating that the dietary effect may manifest along a different compositional gradient. This is also detected for the control group on axis 2.
# Combine PCoA ordination vectors with sample metadata
UniFrac.df <- data.frame(ord$vectors, sample_data(phy_filt))
#get the eigenvalues
eig = 100*(ord$values$Eigenvalues/sum(ord$values$Eigenvalues))
# Set Time_label as an ordered factor
UniFrac.df$Time_label <- factor(UniFrac.df$Time_label, levels = c("T1", "T2", "T3"))
# Boxplot of Axis.1 by Time_label, filled by Main.Factor, faceted by Diet
p1 <- ggplot(UniFrac.df, aes(x = Time_label, y = Axis.1, fill = Time_label)) +
geom_boxplot(alpha=0.8) +
facet_wrap(~ Diet) +
stat_compare_means() +
theme(legend.position = "none") +
ylab(paste0("Axis 1: ", round(eig[1], 2), "%")) + xlab("") +
scale_fill_manual(values=my_colors)
# Boxplot of Axis.2 by Time_label, filled by Main.Factor, faceted by Diet
p2 <- ggplot(UniFrac.df, aes(x = Time_label, y = Axis.2, fill = Time_label)) +
geom_boxplot(alpha=0.8) +
facet_wrap(~ Diet) +
stat_compare_means() +
theme(legend.position = "none") +
ylab(paste0("Axis 2: ", round(eig[2], 2), "%")) + xlab("") +
scale_fill_manual(values=my_colors)
# Arrange plots vertically
gridExtra::grid.arrange(p1, p2, ncol = 1)
To identify taxa potentially driving the patterns observed in the ordination, we examined correlations between species abundances and the first two ordination axes. Abundance data were first transformed using the centered log-ratio (CLR) method to address compositionality. We then performed separate linear regressions of each taxon’s CLR-transformed abundance against Axis 1 and Axis 2 scores.
To account for multiple testing, we applied a false discovery rate (FDR) correction with a significance threshold of 0.05. This analysis identified 13 taxa significantly associated with at least one axis, suggesting their potential role in the observed shifts in community structure.
The taxa identified were: Bacteroides, Barnesiella, Bifidobacterium, Clostridium IV, Clostridium XIVa, Clostridium XVIII, Lactobacillus, Mucispirillum, Parabacteroides, Prevotella, Romboutsia, and Turicibacter. Among these, Lactobacillus, Barnesiella, Clostridium XIVa, and Prevotella showed the largest effect sizes.
# Prepare ordination data frame with SampleID and first two axes
ordination_df <- as.data.frame(ord$vectors) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, Axis.1, Axis.2)
# Join CLR-transformed taxa data with ordination coordinates
clr_ord_df <- left_join(ordination_df, clr_mat, by = "SampleID")
# Identify taxa columns by excluding ordination columns
taxa_cols <- setdiff(colnames(clr_ord_df), c("SampleID", "Axis.1", "Axis.2"))
# Function to run linear regression of each taxon against a specified axis
run_taxon_regression <- function(axis, df, taxa_cols) {
map_dfr(taxa_cols, function(taxon) {
model <- lm(as.formula(paste(axis, "~", taxon)), data = df)
tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(Axis = axis, Taxon = taxon)
})
}
# Run regressions for Axis.1 and Axis.2 and adjust p-values for multiple testing
axis1_results <- run_taxon_regression("Axis.1", clr_ord_df, taxa_cols) %>%
mutate(p.adj = p.adjust(p.value, method = "fdr"))
axis2_results <- run_taxon_regression("Axis.2", clr_ord_df, taxa_cols) %>%
mutate(p.adj = p.adjust(p.value, method = "fdr"))
# Combine results from both axes
results_all <- bind_rows(axis1_results, axis2_results)
# Prepare taxonomy data frame and join with regression results
tax_df <- data.frame(phy_filt@tax_table@.Data, Taxon = rownames(tax_table(phy_filt)))
results_annotated <- left_join(results_all, tax_df, by = "Taxon")
# Prepare volcano plot data from regression results
volcano_data <- results_annotated %>%
mutate(
neg_log10_padj = -log10(p.adj),
Significant = p.adj < 0.05
)
# Basic volcano plot
ggplot(volcano_data, aes(x = estimate, y = neg_log10_padj, label=Genus)) +
geom_point(aes(color = Significant), alpha = 0.7) + geom_text_repel()+
scale_color_manual(values = c("grey60", "red")) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
labs(
x = "Effect Size (Estimate)",
y = "-log10(Adjusted P-value)",
color = "Significant"
) +
theme_minimal() +
theme(
legend.position = "top"
)
#we see that there are 13 taxa that are significantly correlated with axis 1 or 2
foo = subset(volcano_data, p.adj < 0.05)
table(foo$Genus)
##
## Bacteroides Barnesiella Bifidobacterium Clostridium IV
## 1 1 1 1
## Clostridium XlVa Clostridium XVIII Lactobacillus Mucispirillum
## 2 1 1 1
## Parabacteroides Prevotella Romboutsia Turicibacter
## 1 1 1 1
To better understand whether the interaction between time and diet is primarily driven by differences between pre- (t1) and post-treatment (t2, t3) samples, we repeated the PERMANOVA analysis after excluding the pre-treatment (T1) samples and controlling for cage effects. In this post-treatment subset (T2 and T3), the time × diet interaction remained marginally significant, while the main effect of time was not, and the effect of diet was marginally significant. This suggests that microbial communities shift differently from T2 to T3 depending on the diet.
phy_filt_subset = subset_samples(phy_filt, Main.Factor=="post") %>%
prune_taxa(taxa_sums(.) > 0, .) %>%
prune_samples(sample_sums(.) > 0, .)
metadata = data.frame(sample_data(phy_filt_subset))
unifrac_dist <- UniFrac(phy_filt_subset, weighted = TRUE, normalized = TRUE, fast = TRUE)
ord <- ordinate(phy_filt_subset, method = "PCoA", distance = unifrac_dist)
df = data.frame(ord$vectors, sample_data(phy_filt_subset), Depth=sample_sums(phy_filt_subset))
df$Time_numeric = as.numeric(as.factor(df$Time_label))
p = ggplot(df, aes(Axis.1, Axis.2, color=Group, shape=Diet)) + geom_point()
p = ggplot(df, aes(Axis.1, Axis.2, color=Time_label, shape=Diet)) + geom_point() + facet_wrap(~Diet)
#note that diet is not significant
#day is significant
adonis_result <- adonis2(unifrac_dist ~ Time_label*Diet, data = metadata, by="term")
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.255
## Diet 1 0.01087 0.00970 0.4806 0.551
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.040 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis_result <- adonis2(
unifrac_dist ~ Time_label * Diet,
data = metadata,
permutations = 999, by="term",
strata = as.factor(metadata$Cage)
)
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, permutations = 999, by = "term", strata = as.factor(metadata$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.226
## Diet 1 0.01087 0.00970 0.4806 0.060 .
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.042 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now we want to do more hypothesis testing rather than explore the data. Towards this end, we will use an ordination that let’s us quantify the amount of variation in the data that the factors diet and time explain. We’ll assess how much variation is explained by diet, whether there’s a temporal shift in community structure, and whether diet and time interact in any interesting ways. Importantly, what we do here is model variation across diet and time, rather than looking at total variance
However we don’t control for cage effects. In this analysis, we see that the time effect and the time by diet interaction is significant.
# Step 1: Transform to relative abundance
ps_rel <- transform_sample_counts(phy_filt, function(x) x / sum(x))
# Step 2: Extract OTU matrix (samples as rows, taxa as columns)
otu_mat <- as(otu_table(ps_rel), "matrix")
if (taxa_are_rows(ps_rel)) otu_mat <- t(otu_mat)
# Step 3: Extract sample metadata and map cage IDs to cage names
meta <- as(sample_data(ps_rel), "data.frame")
id_to_cage <- c(
"82084" = "Cage1", "82085" = "Cage2", "82086" = "Cage3", "82087" = "Cage4",
"85061" = "Cage5", "85062" = "Cage6", "85063" = "Cage7", "85064" = "Cage8",
"104184" = "Cage9", "104185" = "Cage10", "104186" = "Cage11", "104187" = "Cage12"
)
meta$Cage.new <- plyr::revalue(as.factor(meta$Cage), id_to_cage)
# Step 4: Match samples between OTU matrix and metadata
common_samples <- intersect(rownames(otu_mat), rownames(meta))
otu_mat_clean <- otu_mat[common_samples, ]
meta_clean <- meta[common_samples, ]
meta_clean$SampleID <- rownames(meta_clean)
# Step 5: Remove samples with missing Diet or Day
meta_clean <- meta_clean %>% filter(!is.na(Diet), !is.na(Day))
otu_mat_clean <- otu_mat_clean[rownames(meta_clean), ]
# Step 6: Compute Bray–Curtis distance matrix
bray_dist_clean <- vegdist(otu_mat_clean, method = "bray")
# Step 7: Run db-RDA (constrained ordination)
meta_clean$Group <- relevel(factor(meta_clean$Group), ref = "G1")
db_rda <- capscale(bray_dist_clean ~ Time_label * Diet, data = meta_clean, comm = otu_mat_clean)
# Extract eigenvalues and percent variance explained
eig_vals <- eigenvals(db_rda)
prop_explained <- round(100 * (eig_vals / sum(eig_vals)), 2)
# Step 8: Summarize and test db-RDA model
#summary(db_rda)
anova.cca(db_rda, permutations = 999, by = "term")
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: capscale(formula = bray_dist_clean ~ Time_label * Diet, data = meta_clean, comm = otu_mat_clean)
## Df SumOfSqs F Pr(>F)
## Time_label 2 0.6402 6.1624 0.001 ***
## Diet 1 0.0548 1.0544 0.353
## Time_label:Diet 2 0.1854 1.7845 0.058 .
## Residual 66 3.4282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract taxon scores
taxon_scores <- scores(db_rda, display = "species")
# Step 9: Extract sample scores and join metadata for plotting
sample_scores <- scores(db_rda, display = "sites") %>%
as.data.frame() %>%
rownames_to_column("SampleID")
plot_df <- left_join(sample_scores, meta_clean, by = "SampleID")
# Step 10: Define factor for pre/post groups
plot_df$myfactor <- ifelse(plot_df$Group %in% c("G1", "G2"), "pre", "post")
plot_df$Time_numeric <- as.numeric(as.factor(plot_df$Time_label))
# Step 11: PCoA scatterplot colored by pre/post and shaped by Diet
dba_plot <- ggplot(plot_df, aes(x = CAP1, y = CAP2, color = myfactor, shape = Diet)) +
geom_point(alpha = 0.8, size = 2) +
labs(
x = paste0("CAP1: ", prop_explained[1], " %"),
y = paste0("CAP2: ", prop_explained[2], " %"),
color = "Group"
) +
theme_minimal()
# Step 12: Boxplots comparing groups with stats
# Helper function for group comparisons
plot_group_boxplot <- function(data, group_var, y_var = "CAP1", facet_var = NULL, title = NULL) {
p <- ggplot(data, aes_string(x = group_var, y = y_var)) +
geom_boxplot() +
theme_minimal() +
stat_compare_means()
if (!is.null(facet_var)) {
p <- p + facet_wrap(as.formula(paste("~", facet_var)))
}
if (!is.null(title)) {
p <- p + ggtitle(title)
}
p
}
# Plot G1 vs G2
plot_g1g2 <- plot_group_boxplot(subset(plot_df, Group %in% c("G1", "G2")), "Group")
# Plot G3 vs G4
plot_g3g4 <- plot_group_boxplot(subset(plot_df, Group %in% c("G3", "G4")), "Group")
# Plot Day -1 vs 10 with facet by Diet
plot_day <- plot_group_boxplot(
subset(plot_df, Day %in% c("-1", "10")),
group_var = "Day",
facet_var = "Diet",
title = "CAP1 by Day and Diet"
)
# You can now print or save the plots:
# print(dba_plot)
# print(plot_g1g2)
# print(plot_g3g4)
# print(plot_day)
let’s identify taxa that are driving the segregation of samples along axis 1 and 2. Importantly, we see many of the same species that were identified by unifrac correlation analysis.
# Extract species scores from db-RDA
taxon_scores <- scores(db_rda, display = "species")
# Convert to data frame and clean
taxon_df <- as.data.frame(taxon_scores) %>%
tibble::rownames_to_column(var = "Taxon") %>%
mutate(across(where(is.list), ~ as.numeric(unlist(.)))) %>%
mutate(across(starts_with("CAP"), as.numeric))
# Extract taxonomy table from phyloseq object
tax_table_df <- as.data.frame(tax_table(phy_filt)) %>%
tibble::rownames_to_column(var = "Taxon")
# Merge with taxon_df to get genus names
taxon_annotated <- left_join(taxon_df, tax_table_df, by = "Taxon")
# Replace missing genus with "Unclassified"
taxon_annotated$Genus <- ifelse(is.na(taxon_annotated$Genus), "Unclassified", taxon_annotated$Genus)
# ---- Plot for CAP1 ----
top_taxa_CAP1 <- taxon_annotated %>%
dplyr::arrange(desc(abs(CAP1))) %>%
dplyr::slice(1:20)
db1 = ggplot(top_taxa_CAP1, aes(x = reorder(Genus, CAP1), y = CAP1)) +
geom_bar(stat = "identity", fill = "darkorange") +
coord_flip() +
labs(title = "Top 20 Taxa Contributing to db-RDA Axis 1",
x = "Genus",
y = "CAP1 Score") +
theme_minimal()+
theme(axis.text.y = element_text(face = "italic"))
# ---- Plot for CAP2 ----
top_taxa_CAP2 <- taxon_annotated %>%
dplyr::arrange(desc(abs(CAP2))) %>%
dplyr::slice(1:20)
db2 = ggplot(top_taxa_CAP2, aes(x = reorder(Genus, CAP2), y = CAP2)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 20 Taxa Contributing to db-RDA Axis 2",
x = "Genus",
y = "CAP2 Score") +
theme_minimal()+
theme(axis.text.y = element_text(face = "italic"))
grid.arrange(db1, db2, ncol=2)
SFig2 = plot_grid(db1, db2, ncol=2, labels = c("A", "B"))
ggsave(SFig2, file="~/Desktop/github/perez_diet/SFig2.pdf", device = pdf, width = 12, height = 8)
let’s plot relative abundance of taxa associated with CAP1 and CAP 2. Notably, many of the taxa identified by correlation with Unifrac analysis above were identified in this analysis.
# Subset to Lactobacillus and Bacteroides
ps_subset <- subset_taxa(phy_filt, Genus %in% c("Lactobacillus", "Bacteroides",
"Barnesiella", "Turicibacter",
"Prevotella", "Clostridium XlVa"))
# Transform to relative abundance or clr
ps_rel <- transform_sample_counts(ps_subset, function(x) x / sum(x))
# Melt to long format for ggplot and clean up
ps_melt <- psmelt(ps_rel)
ps_melt$Genus <- as.character(ps_melt$Genus)
ps_melt$Genus[is.na(ps_melt$Genus)] <- "Unclassified"
ps_melt$Time_label = factor(ps_melt$Time_label, levels=c("T1", "T2", "T3"))
#subset on taxa
Turicibacter = subset(ps_melt, Genus=="Turicibacter")
Bacteroides = subset(ps_melt, Genus=="Bacteroides")
Lactobacillus = subset(ps_melt, Genus=="Lactobacillus")
Barnesiella = subset(ps_melt, Genus=="Barnesiella")
Prevotella = subset(ps_melt, Genus=="Prevotella")
Clostridium = subset(ps_melt, Genus=="Clostridium XlVa")
mycolorpalette =c(
"Lactobacillus" = "#aec7e8",
"Bacteroides" = "#2ca02c",
"Barnesiella" = "#d62728",
"Turicibacter" = "#c49c94",
"Prevotella" = "#e377c2",
"Clostridium XlVa" = "#ff9896"
)
#plot by genus
plot_my_taxa <- function(df, species) {
# Filter for the target genus
df_sub <- df %>% filter(Genus == species)
# Wilcoxon pairwise tests within each Diet, adjust p-values using BH
pval_df <- df_sub %>%
group_by(Diet) %>%
pairwise_wilcox_test(
Abundance ~ Time_label,
p.adjust.method = "BH",
comparisons = mycomparisons
) %>%
ungroup() %>%
add_significance("p.adj") %>%
add_y_position(step.increase = 0.05) # automatic y-positioning
# Plot
ggplot(df_sub, aes(x = Time_label, y = Abundance, fill = Genus)) +
geom_boxplot(position = position_dodge(width = 0.8), outlier.shape = NA) +
geom_point(position = position_jitterdodge(jitter.width = 0.15, dodge.width = 0.8)) +
facet_wrap(~Diet, scales = "free_x") +
scale_fill_manual(values = mycolorpalette) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(face = "italic")
) +
xlab("") +
ylab("Relative Abundance") +
ggtitle(species) +
stat_pvalue_manual(
pval_df,
label = "p.adj.signif",
tip.length = 0.01,
size = 3
)
}
#we drop clostridium because nothing is significant when we clr transform
fig1e1 <- plot_my_taxa(Lactobacillus, species="Lactobacillus") + ylim(0, 0.9)
fig1e2 <- plot_my_taxa(Bacteroides, species="Bacteroides")+ ylim(0, 0.9)
fig1e3 <- plot_my_taxa(Barnesiella, species="Barnesiella")+ ylim(0, 0.9)
fig1e4 <- plot_my_taxa(Turicibacter, species="Turicibacter") + ylim(0, 0.4)
fig1e6 <- plot_my_taxa(Prevotella, species="Prevotella")+ ylim(0, 0.4)
fig1E <- plot_grid(fig1e1, fig1e2, fig1e3, fig1e4,fig1e6,
ncol = 2, align = "hv")
fig1E
save figure 1e
ggsave(fig1E, file="~/Desktop/github/perez_diet/fig1E.pdf", device="pdf", height = 12, width = 7)
ggsave(fig1E, file="~/Desktop/github/perez_diet/fig1E.eps", device="eps", height = 12, width = 7)
let’s see if these differences are robust to clr transformation
# CLR transform the data
ps_clr <- microbiome::transform(ps_subset, "clr")
# Convert to long format
clr_long <- ps_clr %>%
phyloseq::psmelt() %>%
as_tibble()
# Clean up genus names
clr_long$Genus <- as.character(clr_long$Genus)
clr_long$Genus[is.na(clr_long$Genus)] <- "Unclassified"
# Rename the abundance column to "CLR"
clr_long <- clr_long %>% dplyr::rename(CLR = Abundance)
# Ensure factor levels are correct
clr_long$Time_label <- factor(clr_long$Time_label, levels = c("T1", "T2", "T3"))
# Plot function (modified for CLR)
plot_my_taxa_clr <- function(df, species) {
df_sub <- df %>% filter(Genus == species)
# Wilcoxon pairwise tests within each Diet
pval_df <- df_sub %>%
group_by(Diet) %>%
pairwise_wilcox_test(
CLR ~ Time_label,
p.adjust.method = "BH",
comparisons = mycomparisons
) %>%
ungroup() %>%
add_significance("p.adj") %>%
add_y_position(step.increase = 0.05)
ggplot(df_sub, aes(x = Time_label, y = CLR, fill = Genus)) +
geom_boxplot(position = position_dodge(width = 0.8), outlier.shape = NA) +
geom_point(position = position_jitterdodge(jitter.width = 0.15, dodge.width = 0.8)) +
facet_wrap(~Diet, scales = "free_x") +
scale_fill_manual(values = mycolorpalette) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(face = "italic")
) +
xlab("") +
ylab("CLR Abundance") +
ggtitle(species) +
stat_pvalue_manual(
pval_df,
label = "p.adj.signif",
tip.length = 0.01,
size = 3
)
}
fig1clr1 <- plot_my_taxa_clr(clr_long, species = "Lactobacillus")
fig1clr2 <- plot_my_taxa_clr(clr_long, species = "Bacteroides")
fig1clr3 <- plot_my_taxa_clr(clr_long, species = "Barnesiella")
fig1clr4 <- plot_my_taxa_clr(clr_long, species = "Turicibacter")
fig1clr5 <- plot_my_taxa_clr(clr_long, species = "Prevotella")
fig1CLR <- plot_grid(fig1clr1, fig1clr2, fig1clr3, fig1clr4, fig1clr5,
ncol = 2, align = "hv")
fig1CLR
let’s look at whether we see a difference between T1 vs T1, T2 vs T2, T3 vs T3 by diet. At T3, Turicibacter was significantly more abundant in the HOA group than in the control group.
# Run pairwise Wilcoxon tests
pval_clr_df <- clr_long %>%
group_by(Genus, Time_label) %>%
pairwise_wilcox_test(
CLR ~ Diet,
p.adjust.method = "BH"
) %>%
add_significance("p.adj") %>%
add_y_position()
# Plot function per genus
plot_genus_diet_clr <- function(genus_name, clr_long, pval_df) {
df_sub <- clr_long %>% filter(Genus == genus_name)
pval_sub <- pval_df %>% filter(Genus == genus_name)
ggplot(df_sub, aes(x = Diet, y = CLR, fill = Diet)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) +
facet_wrap(~Time_label, scales = "free_y") +
stat_pvalue_manual(
pval_sub,
label = "p.adj.signif",
tip.length = 0.01,
size = 4,
bracket.size = 0.5
) +
scale_fill_manual(values = c("HOA" = "#1f77b4", "control (AIN-93G)" = "#ff7f0e")) +
theme_minimal(base_size = 13) +
theme(legend.position = "none") +
labs(
title = paste0("*", genus_name, "*"),
y = "CLR abundance",
x = "Diet"
) +
theme(plot.title = ggtext::element_markdown(size = 14))
}
# Subset to genera of interest
target_genera <- c("Turicibacter", "Bacteroides", "Lactobacillus", "Barnesiella", "Prevotella")
# Generate and plot selected genera
plots <- lapply(target_genera, function(genus) {
plot_genus_diet_clr(genus, clr_long, pval_clr_df)
})
# Display plots in a grid
cowplot::plot_grid(plotlist = plots, ncol = 3)
plot_aldex_by_time <- function(phy_obj, time_point) {
# Filter samples by Time_label using prune_samples
samples_to_keep <- sample_names(phy_obj)[sample_data(phy_obj)$Time_label == time_point]
if(length(samples_to_keep) == 0) {
stop(paste("No samples found for Time_label =", time_point))
}
phy_sub <- prune_samples(samples_to_keep, phy_obj)
# Extract OTU counts and conditions
otu_mat <- as.data.frame(otu_table(phy_sub))
conds <- sample_data(phy_sub)$Diet
# Run ALDEx2 test
ald <- aldex(
t(otu_mat),
conds,
test = "t",
effect = TRUE,
denom = "all"
)
tax_df <- as.data.frame(tax_table(phy_sub)) %>% rownames_to_column("OTU")
ald_df <- as.data.frame(ald) %>%
rownames_to_column(var = "OTU") %>%
left_join(
dplyr::select(tax_df, OTU, Genus),
by = "OTU"
) %>%
mutate(
signif = wi.eBH < 0.05,
x = rab.all,
y = diff.btw
)
top_hits <- ald_df %>%
filter(signif) %>%
slice_max(order_by = abs(y), n = 10)
p <- ggplot(ald_df, aes(x = x, y = y)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_point(aes(color = signif), alpha = 0.6, size = 1.8) +
scale_color_manual(values = c("grey70", "firebrick")) +
geom_text_repel(
data = top_hits,
aes(label = Genus),
size = 3,
box.padding = 0.3,
point.padding = 0.2,
max.overlaps = 15
) +
labs(
title = paste("ALDEx2 Diet Comparison at", time_point),
x = "Median CLR Abundance",
y = paste("Median CLR Difference (HOA vs control at", time_point, ")"),
color = "Adj. p < 0.05"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "right")
return(p)
}
# Plot ALDEx2 results for each Time point; but only display T2 cause it's the only one with a significant hit
p_T1 <- plot_aldex_by_time(phy_filt, "T1")
p_T2 <- plot_aldex_by_time(phy_filt, "T2")
p_T3 <- plot_aldex_by_time(phy_filt, "T3")
p_T2
let’s plot the relative abundnace of Acetatifactor
phy_filt_ra = transform_sample_counts(phy_filt, function(x) x/sum(x))
phy_acetatifactor <- subset_taxa(phy_filt_ra, Genus == "Acetatifactor")
tax.count <- data.frame(sample_data(phy_acetatifactor), otu_table(phy_acetatifactor)) %>%
reshape2::melt(., colnames(sample_data(phy_acetatifactor)))
#plot
ggplot(tax.count, aes(x = Diet, y = round(100*value, 2), fill = Diet)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) +
facet_wrap(~ Time_label, scales = "free_x") +
labs(
title = "Relative Abundance of *Acetatifactor* by Diet",
y = "Relative Abundance",
x = "Diet"
) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Compose left side: fig1A (panel a), fig1B (panel b)
fig1_top <- ggdraw() +
# Panel a: left column
draw_plot(fig1B, x = 0, y = 0, width = 0.5, height = 1) +
draw_plot_label("B", x = 0.01, y = 0.98, size = 14, fontface = "bold") +
# Panel b: right column
draw_plot(Fig1C, x = 0.5, y = 0, width = 0.5, height = 1) +
draw_plot_label("C", x = 0.51, y = 0.98, size = 14, fontface = "bold")
fig1_bottom <- ggdraw() +
# Panel c: left column
draw_plot(fig1D, x = 0, y = 0, width = 0.5, height = 1) +
draw_plot_label("D", x = 0.01, y = 0.98, size = 14, fontface = "bold") +
# Panel d: right column
draw_plot(fig1E, x = 0.5, y = 0, width = 0.5, height = 1) +
draw_plot_label("d", x = 0.51, y = 0.98, size = 14, fontface = "bold")
# Combine the two columns
Fig1 <- plot_grid(
fig1_top,
fig1_bottom,
ncol = 1,
rel_heights = c(0.5, 1) # top is half the height of bottom
)
# Print
print(Fig1)
ggsave(Fig1, file="~/Desktop/github/perez_diet/Figure1.pdf", width = 11, height = 12)